# Import necessary packages
library(rlang)
library(ggplot2)
library(ggtext)
library(readxl)
library(broom)
library(tidyverse)
library(extrafont)
library(Cairo)
# Register the fonts known to extrafont with the Windows graphics device so
# that plots can refer to them by family name.
# NOTE(review): both calls are Windows-specific; presumably the fonts were
# imported once beforehand with extrafont::font_import() -- confirm.
loadfonts(device = "win")
# Map the family name "LM_Roman" (the name the plots in this script pass as
# `family`) to the installed "LM Roman 12" font.
windowsFonts(LM_Roman = windowsFont("LM Roman 12"))

# Set up data (change file path)
# NOTE(review): setwd() ties the script to one machine; it is kept because the
# relative input path below and the relative ggsave() outputs resolve against
# this folder.
setwd("C:/Users/.../leaders-and-twitter_replication-materials")

# Read the workbook from the "data" sub-folder of the working directory.
# The path must be relative (no leading "/"): an absolute "/data/..." path is
# resolved from the drive root and ignores the setwd() call above.
df <- readxl::read_excel(
  file.path("data", "data-sentiment-model-comparison.xlsx")
)

# Convert the three label columns to factors so table() cross-tabulates them
# by level.
df$vader_label <- as.factor(df$vader_label)
df$bert_label <- as.factor(df$bert_label)
df$manual_label <- as.factor(df$manual_label)

# Create confusion matrices: rows are the manual labels, columns are the
# labels produced by each model.
# NOTE(review): the roBERTa matrix is built from the `bert_label` column;
# presumably that column holds the XLM-roBERTa predictions -- confirm.
conf_mat_vader <- table(df$manual_label, df$vader_label)
conf_mat_roberta <- table(df$manual_label, df$bert_label)

# Convert the confusion matrices to tidy (long) data frames: one row per
# (manual label, model label) pair together with its count. The objects are
# already tables, so as.data.frame() can be applied to them directly.
conf_mat_vader_df <- as.data.frame(conf_mat_vader)
names(conf_mat_vader_df) <- c("Manual", "VADER", "Frequency")

conf_mat_roberta_df <- as.data.frame(conf_mat_roberta)
names(conf_mat_roberta_df) <- c("Manual", "Transformer", "Frequency")


### VADER
# Heatmap of the VADER confusion matrix: manual labels on the x axis, VADER
# labels on the y axis; the tile shading and the printed number both show the
# cell count.
plot_vader <- ggplot(
  conf_mat_vader_df,
  aes(Manual, VADER, fill = Frequency)
) +
  geom_tile() +
  # Counts are added after the tiles so they are drawn on top of them.
  geom_text(
    aes(label = Frequency),
    family = "LM_Roman", size = 10, color = "black", vjust = 0.5
  ) +
  scale_fill_gradient(low = "white", high = "purple") +
  xlab("Manual Labels") +
  ylab("VADER Labels") +
  theme_minimal() +
  # Every text element uses the "LM_Roman" family: 20 pt for axis titles,
  # 17.5 pt for axis and legend text; y tick labels are rotated by 90 degrees.
  theme(
    axis.title.x = element_text(family = "LM_Roman", size = 20, hjust = 0.5),
    axis.title.y = element_text(family = "LM_Roman", size = 20, hjust = 0.5),
    axis.text.x = element_text(family = "LM_Roman", size = 17.5),
    axis.text.y = element_text(
      family = "LM_Roman", size = 17.5, angle = 90, hjust = 0.5
    ),
    legend.title = element_text(family = "LM_Roman", size = 17.5),
    legend.text = element_text(family = "LM_Roman", size = 17.5)
  )

# Show the plot in the active graphics device (e.g. the RStudio plot pane);
# print() is explicit so it also renders when the script is source()d.
print(plot_vader)

# Write the figure to the working directory: 6 x 6 (ggsave's default units)
# at 300 dpi.
# NOTE(review): this figure is saved as .jpg while the roBERTa figure is saved
# as .png -- confirm the mixed formats are intended.
ggsave(
  "confusion_matrix_vader.jpg",
  plot = plot_vader,
  width = 6, height = 6, dpi = 300
)


### XLM-roBERTa
# Heatmap of the transformer confusion matrix: manual labels on the x axis,
# transformer labels on the y axis; the tile shading and the printed number
# both show the cell count.
plot_roberta <- ggplot(
  conf_mat_roberta_df,
  aes(Manual, Transformer, fill = Frequency)
) +
  geom_tile() +
  # Counts are added after the tiles so they are drawn on top of them.
  geom_text(
    aes(label = Frequency),
    family = "LM_Roman", size = 10, color = "black", vjust = 0.5
  ) +
  scale_fill_gradient(low = "white", high = "purple") +
  xlab("Manual Labels") +
  ylab("XLM-roBERTa-base Labels") +
  theme_minimal() +
  # Every text element uses the "LM_Roman" family: 20 pt for axis titles,
  # 17.5 pt for axis and legend text; y tick labels are rotated by 90 degrees.
  theme(
    axis.title.x = element_text(family = "LM_Roman", size = 20, hjust = 0.5),
    axis.title.y = element_text(family = "LM_Roman", size = 20, hjust = 0.5),
    axis.text.x = element_text(family = "LM_Roman", size = 17.5),
    axis.text.y = element_text(
      family = "LM_Roman", size = 17.5, angle = 90, hjust = 0.5
    ),
    legend.title = element_text(family = "LM_Roman", size = 17.5),
    legend.text = element_text(family = "LM_Roman", size = 17.5)
  )

# Show the plot in the active graphics device (e.g. the RStudio plot pane);
# print() is explicit so it also renders when the script is source()d.
print(plot_roberta)

# Write the figure to the working directory: 6 x 6 (ggsave's default units)
# at 300 dpi.
ggsave(
  "confusion_matrix_roberta.png",
  plot = plot_roberta,
  width = 6, height = 6, dpi = 300
)


